{
 "cells": [
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "c04ffe8e-6573-470f-aef5-348522a0de15",
   "metadata": {},
   "source": [
    "# PII Masking"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "efa2a242-27bc-478f-8939-18a7f8153d4f",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:numexpr.utils:Note: NumExpr detected 16 cores but \"NUMEXPR_MAX_THREADS\" not set, so enforcing safe limit of 8.\n",
      "Note: NumExpr detected 16 cores but \"NUMEXPR_MAX_THREADS\" not set, so enforcing safe limit of 8.\n",
      "INFO:numexpr.utils:NumExpr defaulting to 8 threads.\n",
      "NumExpr defaulting to 8 threads.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/loganm/miniconda3/envs/llama-index/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from .autonotebook import tqdm as notebook_tqdm\n"
     ]
    }
   ],
   "source": [
    "import logging\n",
    "import sys\n",
    "\n",
    "logging.basicConfig(stream=sys.stdout, level=logging.INFO)\n",
    "logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))\n",
    "\n",
    "from llama_index.indices.postprocessor import (\n",
    "    PIINodePostprocessor,\n",
    "    NERPIINodePostprocessor,\n",
    ")\n",
    "from llama_index.llms import HuggingFaceLLM\n",
    "from llama_index import ServiceContext, Document, VectorStoreIndex\n",
    "from llama_index.schema import TextNode"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "216e951a-42c4-4e6b-b16d-6a6064829ebf",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# load documents\n",
    "text = \"\"\"\n",
    "Hello Paulo Santos. The latest statement for your credit card account \\\n",
    "1111-0000-1111-0000 was mailed to 123 Any Street, Seattle, WA 98109.\n",
    "\"\"\"\n",
    "node = TextNode(text=text)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "24495d69-d568-4cc7-9445-87692bf77863",
   "metadata": {},
   "source": [
    "### Option 1: Use NER Model for PII Masking\n",
    "\n",
    "Use a Hugging Face NER model for PII Masking"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "003f66f0-f67f-47f2-88eb-b2bbb6d33791",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "service_context = ServiceContext.from_defaults()\n",
    "processor = NERPIINodePostprocessor(service_context=service_context)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "e76c995c-57ee-4d1b-a771-6626ef93e8cd",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "No model was supplied, defaulted to dbmdz/bert-large-cased-finetuned-conll03-english and revision f2482bf (https://huggingface.co/dbmdz/bert-large-cased-finetuned-conll03-english).\n",
      "Using a pipeline without specifying a model name and revision in production is not recommended.\n",
      "/home/loganm/miniconda3/envs/llama-index/lib/python3.11/site-packages/transformers/pipelines/token_classification.py:169: UserWarning: `grouped_entities` is deprecated and will be removed in version v5.0.0, defaulted to `aggregation_strategy=\"AggregationStrategy.SIMPLE\"` instead.\n",
      "  warnings.warn(\n"
     ]
    }
   ],
   "source": [
    "from llama_index.schema import NodeWithScore\n",
    "\n",
    "new_nodes = processor.postprocess_nodes([NodeWithScore(node=node)])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "d4783c27-9a55-44f1-be9e-a4fe1fc1e0fa",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'Hello [ORG_6]. The latest statement for your credit card account 1111-0000-1111-0000 was mailed to 123 [ORG_108] [LOC_112], [LOC_120], [LOC_129] 98109.'"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# view redacted text\n",
    "new_nodes[0].node.get_text()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "075d45dc-a226-4ba7-8c8a-d9dd536f8560",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'[ORG_6]': 'Paulo Santos',\n",
       " '[ORG_108]': 'Any',\n",
       " '[LOC_112]': 'Street',\n",
       " '[LOC_120]': 'Seattle',\n",
       " '[LOC_129]': 'WA'}"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# get mapping in metadata\n",
    "# NOTE: this is not sent to the LLM!\n",
    "new_nodes[0].node.metadata[\"__pii_node_info__\"]"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "06ca1e50-eeee-4079-bec6-3621cb760f98",
   "metadata": {},
   "source": [
    "### Option 2: Use LLM for PII Masking\n",
    "\n",
    "NOTE: You should be using a *local* LLM model for PII masking. The example shown is using OpenAI, but normally you'd use an LLM running locally, possibly from huggingface. Examples for local LLMs are [here](https://gpt-index.readthedocs.io/en/latest/how_to/customization/custom_llms.html#example-using-a-huggingface-llm)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "5a2db8d3-6bb7-4855-852e-a4941abb03bf",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "service_context = ServiceContext.from_defaults()\n",
    "processor = PIINodePostprocessor(service_context=service_context)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "b834e7a3-8f90-45eb-841a-335b0d33dcab",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "from llama_index.schema import NodeWithScore\n",
    "\n",
    "new_nodes = processor.postprocess_nodes([NodeWithScore(node=node)])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "ca1498f3-34a1-4001-90f9-03feb5532d7d",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'Hello [NAME]. The latest statement for your credit card account [CREDIT_CARD_NUMBER] was mailed to [ADDRESS].'"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# view redacted text\n",
    "new_nodes[0].node.get_text()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "d574d591-c1db-498b-ba32-9ed4190c6b4c",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'NAME': 'Paulo Santos',\n",
       " 'CREDIT_CARD_NUMBER': '1111-0000-1111-0000',\n",
       " 'ADDRESS': '123 Any Street, Seattle, WA 98109'}"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# get mapping in metadata\n",
    "# NOTE: this is not sent to the LLM!\n",
    "new_nodes[0].node.metadata[\"__pii_node_info__\"]"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "3444d895-e2fd-4af9-834a-64acf49f74f8",
   "metadata": {},
   "source": [
    "### Feed Nodes to Index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "9d33a9c0-efcd-4e79-b1f5-05aca9fc109f",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total LLM token usage: 0 tokens\n",
      "> [build_index_from_nodes] Total LLM token usage: 0 tokens\n",
      "INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total embedding token usage: 30 tokens\n",
      "> [build_index_from_nodes] Total embedding token usage: 30 tokens\n"
     ]
    }
   ],
   "source": [
    "# feed into index\n",
    "index = VectorStoreIndex([n.node for n in new_nodes])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "dc8b1993-d23b-4db1-8bb9-4f882bded66c",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:llama_index.token_counter.token_counter:> [retrieve] Total LLM token usage: 0 tokens\n",
      "> [retrieve] Total LLM token usage: 0 tokens\n",
      "INFO:llama_index.token_counter.token_counter:> [retrieve] Total embedding token usage: 8 tokens\n",
      "> [retrieve] Total embedding token usage: 8 tokens\n",
      "INFO:llama_index.token_counter.token_counter:> [get_response] Total LLM token usage: 71 tokens\n",
      "> [get_response] Total LLM token usage: 71 tokens\n",
      "INFO:llama_index.token_counter.token_counter:> [get_response] Total embedding token usage: 0 tokens\n",
      "> [get_response] Total embedding token usage: 0 tokens\n",
      "\n",
      "[ADDRESS]\n"
     ]
    }
   ],
   "source": [
    "response = index.as_query_engine().query(\"What address was the statement mailed to?\")\n",
    "print(str(response))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a2d7e3b8-3495-4314-9ec9-34adc9cc003d",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "llama-index",
   "language": "python",
   "name": "llama-index"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
